Library

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally) # ggplot2-based visualization of correlations
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(factoextra) # ggplot2-based visualization of pca
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(countrycode)
library(rworldmap)
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type :   vignette('rworldmap')
library(mice)
## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(readr)

library(cluster)
library(mclust)
## Package 'mclust' version 6.0.1
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## 
## The following object is masked from 'package:purrr':
## 
##     map

Data

rm(list = ls())
lifeexpec = read.csv("./data/lifeexpec.csv")
obesity = read.csv("./data/obesity.csv")
tobacco = read.csv("./data/tobacco.csv")
doctor_density = read.csv("./data/doctor_density.csv")
gob_expenditure = read.csv("./data/gov_expenditure.csv")
road_death = read.csv("./data/road_death.csv")
birth_by_skilled = read.csv("./data/birth_by_skilled.csv")
maternal_death = read.csv("./data/maternal_death.csv")

lifeexpec = lifeexpec |> select(GEO_NAME_SHORT, DIM_TIME, VALUE_NUMERIC)
lifeexpec |> distinct(DIM_TIME)
##   DIM_TIME
## 1     2000
## 2     2010
## 3     2015
## 4     2019
lifeexpec = lifeexpec |> filter(DIM_TIME == "2015")
colnames(lifeexpec) = c("country", "year", "life_value")
lifeexpec <- lifeexpec |> distinct(country, .keep_all = TRUE)

obesity = obesity |> select(GEO_NAME_SHORT, DIM_TIME, VALUE_NUMERIC)
obesity |> distinct(DIM_TIME)
##    DIM_TIME
## 1      1975
## 2      1976
## 3      1977
## 4      1978
## 5      1979
## 6      1980
## 7      1981
## 8      1982
## 9      1983
## 10     1984
## 11     1985
## 12     1986
## 13     1987
## 14     1988
## 15     1989
## 16     1990
## 17     1991
## 18     1992
## 19     1993
## 20     1994
## 21     1995
## 22     1996
## 23     1997
## 24     1998
## 25     1999
## 26     2000
## 27     2001
## 28     2002
## 29     2003
## 30     2004
## 31     2005
## 32     2006
## 33     2007
## 34     2008
## 35     2009
## 36     2010
## 37     2011
## 38     2012
## 39     2013
## 40     2014
## 41     2015
## 42     2016
obesity = obesity |> filter(DIM_TIME == "2015")
obesity = obesity |> select(-DIM_TIME)
colnames(obesity) = c("country", "obesity_value")
obesity <- obesity |> distinct(country, .keep_all = TRUE)

tobacco = tobacco |> select(GEO_NAME_SHORT, DIM_TIME, VALUE_NUMERIC)
tobacco |> distinct(DIM_TIME)
##   DIM_TIME
## 1     2000
## 2     2005
## 3     2010
## 4     2015
## 5     2018
## 6     2019
## 7     2020
## 8     2025
tobacco = tobacco |> filter(DIM_TIME == "2015")
tobacco = tobacco |> select(-DIM_TIME)
colnames(tobacco) = c("country", "tobacco_value")
tobacco <- tobacco |> distinct(country, .keep_all = TRUE)

doctor_density = doctor_density |> select(GEO_NAME_SHORT, DIM_TIME, VALUE_NUMERIC)
doctor_density |> distinct(DIM_TIME)
##    DIM_TIME
## 1      1990
## 2      1991
## 3      1992
## 4      1993
## 5      1994
## 6      1995
## 7      1996
## 8      1997
## 9      1998
## 10     1999
## 11     2000
## 12     2001
## 13     2002
## 14     2003
## 15     2004
## 16     2005
## 17     2006
## 18     2007
## 19     2008
## 20     2009
## 21     2010
## 22     2011
## 23     2012
## 24     2013
## 25     2014
## 26     2015
## 27     2016
## 28     2017
## 29     2018
## 30     2019
## 31     2020
## 32     2021
doctor_density = doctor_density |> filter(DIM_TIME == "2015")
doctor_density = doctor_density |> select(-DIM_TIME)
colnames(doctor_density) = c("country", "doc_den_value")
doctor_density <- doctor_density |> distinct(country, .keep_all = TRUE)

gob_expenditure = gob_expenditure |> select(GEO_NAME_SHORT, DIM_TIME, VALUE_NUMERIC)
gob_expenditure |> distinct(DIM_TIME)
##    DIM_TIME
## 1      2000
## 2      2001
## 3      2002
## 4      2003
## 5      2004
## 6      2005
## 7      2006
## 8      2007
## 9      2008
## 10     2009
## 11     2010
## 12     2011
## 13     2012
## 14     2013
## 15     2014
## 16     2015
## 17     2016
## 18     2017
## 19     2018
## 20     2019
## 21     2020
gob_expenditure = gob_expenditure |> filter(DIM_TIME == "2015")
gob_expenditure = gob_expenditure |> select(-DIM_TIME)
colnames(gob_expenditure) = c("country", "gob_expenditure")
gob_expenditure <- gob_expenditure |> distinct(country, .keep_all = TRUE)

road_death = road_death |> select(GEO_NAME_SHORT, DIM_TIME, VALUE_NUMERIC)
road_death |> distinct(DIM_TIME)
##    DIM_TIME
## 1      2000
## 2      2001
## 3      2002
## 4      2003
## 5      2004
## 6      2005
## 7      2006
## 8      2007
## 9      2008
## 10     2009
## 11     2010
## 12     2011
## 13     2012
## 14     2013
## 15     2014
## 16     2015
## 17     2016
## 18     2017
## 19     2018
## 20     2019
road_death = road_death |> filter(DIM_TIME == "2015")
road_death = road_death |> select(-DIM_TIME)
colnames(road_death) = c("country", "road_death")
road_death <- road_death |> distinct(country, .keep_all = TRUE)

birth_by_skilled = birth_by_skilled |> select(GEO_NAME_SHORT, DIM_TIME, VALUE_NUMERIC)
birth_by_skilled |> distinct(DIM_TIME)
##     DIM_TIME
## 1       2001
## 2       2002
## 3       2003
## 4       2004
## 5       2006
## 6       2007
## 7       2008
## 8       2009
## 9       2011
## 10      2012
## 11      2013
## 12      2014
## 13      2016
## 14      2017
## 15      2018
## 16      2019
## 17      2021
## 18      2000
## 19      2005
## 20      2010
## 21      2015
## 22      2020
## 23      2022
## 24 2019-2020
## 25 2016-2017
## 26 2013-2014
## 27 2011-2012
## 28 2010-2011
## 29 2008-2009
## 30 2017-2018
## 31 2012-2013
## 32 2018-2019
## 33 2006-2007
## 34 2015-2016
## 35 1999-2000
## 36 2003-2004
## 37 2009-2010
## 38 2005-2006
## 39 2021-2022
## 40 2014-2015
## 41 2004-2005
## 42 2002-2003
## 43 2000-2001
## 44 2007-2008
## 45 2019-2021
## 46 2013-2015
## 47 2001-2003
## 48 2001-2002
## 49 2016-2018
## 50 2004-2006
## 51 2012-2014
## 52 2020-2021
## 53 2010-2013
birth_by_skilled = birth_by_skilled |> filter(DIM_TIME == "2015")
birth_by_skilled = birth_by_skilled |> select(-DIM_TIME)
colnames(birth_by_skilled) = c("country", "birth_by_skilled")
birth_by_skilled <- birth_by_skilled |> distinct(country, .keep_all = TRUE)

maternal_death = maternal_death |> select(GEO_NAME_SHORT, DIM_TIME, VALUE_NUMERIC)
maternal_death |> distinct(DIM_TIME)
##    DIM_TIME
## 1      1985
## 2      1986
## 3      1987
## 4      1988
## 5      1989
## 6      1990
## 7      1991
## 8      1992
## 9      1993
## 10     1994
## 11     1995
## 12     1996
## 13     1997
## 14     1998
## 15     1999
## 16     2000
## 17     2001
## 18     2002
## 19     2003
## 20     2004
## 21     2005
## 22     2006
## 23     2007
## 24     2008
## 25     2009
## 26     2010
## 27     2011
## 28     2012
## 29     2013
## 30     2014
## 31     2015
## 32     2016
## 33     2017
## 34     2018
## 35     2019
## 36     2020
maternal_death = maternal_death |> filter(DIM_TIME == "2015")
maternal_death = maternal_death |> select(-DIM_TIME)
colnames(maternal_death) = c("country", "maternal_death")
maternal_death <- maternal_death |> distinct(country, .keep_all = TRUE)

# input continent for grouping
country = lifeexpec |> select(country)
continent = read.csv("./data/Countries-Continents.csv")
continent$Country[continent$Country == "Korea, South"] <- "Republic of Korea"

name = left_join(country, continent, by = join_by("country" == "Country"))
name = na.omit(name)


rm(df)
## Warning in rm(df): object 'df' not found
df = left_join(lifeexpec, obesity, by = join_by(country))
df = left_join(df, tobacco, by = join_by(country))
df = left_join(df, doctor_density, by = join_by(country))
df = left_join(df, gob_expenditure, by = join_by(country))
df = left_join(df, road_death, by = join_by(country))
df = left_join(df, birth_by_skilled, by = join_by(country))
df = left_join(df, maternal_death, by = join_by(country))
df = left_join(df, name, by = join_by(country))
df = df |> relocate(Continent, .after = country)
df = df |>  drop_na(Continent)

df = df |> rename("continent" = "Continent")

table(is.na(df))
## 
## FALSE  TRUE 
##  1581   168
summary(df)
##    country           continent              year        life_value   
##  Length:159         Length:159         Min.   :2015   Min.   :47.67  
##  Class :character   Class :character   1st Qu.:2015   1st Qu.:64.69  
##  Mode  :character   Mode  :character   Median :2015   Median :72.58  
##                                        Mean   :2015   Mean   :71.71  
##                                        3rd Qu.:2015   3rd Qu.:77.74  
##                                        Max.   :2015   Max.   :84.29  
##                                                                      
##  obesity_value   tobacco_value   doc_den_value    gob_expenditure 
##  Min.   : 2.10   Min.   : 0.20   Min.   : 0.315   Min.   : 1.330  
##  1st Qu.: 8.50   1st Qu.: 2.70   1st Qu.: 4.612   1st Qu.: 6.615  
##  Median :20.50   Median : 7.75   Median :22.679   Median : 9.340  
##  Mean   :18.95   Mean   :11.30   Mean   :22.212   Mean   : 9.987  
##  3rd Qu.:25.40   3rd Qu.:19.68   3rd Qu.:33.852   3rd Qu.:12.890  
##  Max.   :53.90   Max.   :39.10   Max.   :77.586   Max.   :29.490  
##  NA's   :2       NA's   :21      NA's   :65       NA's   :3       
##    road_death    birth_by_skilled maternal_death    
##  Min.   : 0.00   Min.   : 39.70   Min.   :   1.314  
##  1st Qu.: 6.26   1st Qu.: 96.30   1st Qu.:  10.786  
##  Median :14.24   Median : 99.05   Median :  52.027  
##  Mean   :16.00   Mean   : 94.31   Mean   : 162.763  
##  3rd Qu.:22.45   3rd Qu.: 99.90   3rd Qu.: 210.396  
##  Max.   :63.37   Max.   :100.00   Max.   :1224.722  
##                  NA's   :77
dim(df)
## [1] 159  11

I obtained dataset from the World Health Organization.

Imputation NAs

sapply(df, function(x) sum(is.na(x))*100/nrow(df))
##          country        continent             year       life_value 
##         0.000000         0.000000         0.000000         0.000000 
##    obesity_value    tobacco_value    doc_den_value  gob_expenditure 
##         1.257862        13.207547        40.880503         1.886792 
##       road_death birth_by_skilled   maternal_death 
##         0.000000        48.427673         0.000000
m = 4 # number of multiple imputations, we are going to make 4 iterations, we're going to predict missing values 4 times.
mice_mod = mice(df, m = m, method='rf') # machine learning tool, rf = random forest
## 
##  iter imp variable
##   1   1  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   1   2  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   1   3  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   1   4  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   2   1  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   2   2  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   2   3  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   2   4  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   3   1  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   3   2  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   3   3  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   3   4  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   4   1  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   4   2  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   4   3  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   4   4  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   5   1  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   5   2  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   5   3  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
##   5   4  obesity_value  tobacco_value  doc_den_value  gob_expenditure  birth_by_skilled
## Warning: Number of logged events: 3
df_imput <- complete(mice_mod, action=m) # replace missiong value with the mice_mod

Data frame

df_imput_n = df_imput[,4:ncol(df_imput)]

table(is.na(df_imput_n))
## 
## FALSE 
##  1272
name = df_imput$country
continent = df_imput$continent

library(corrplot)
## corrplot 0.92 loaded
corrplot(cor(df_imput_n), method = "number")

Boxplot for checking the distribution of variables

library(ggplot2)
library(tidyr)
df_long <- df_imput_n |> gather(variable, value)

# Create boxplot
ggplot(df_long, aes(x = variable, y = value)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme_minimal()

# log transformation for the variable of maternal_death
df_imput$maternal_death = log(df_imput$maternal_death)

df_long <- df_imput[,4:ncol(df_imput)] |> gather(variable, value)

ggplot(df_long, aes(x = variable, y = value)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme_minimal()

Graph 1

ggplot(df_imput, aes(x=obesity_value, y=life_value, group=continent)) + 
  geom_point(alpha=0.9) + #geom_smooth(se=F, size=0.3) +
  facet_wrap(~ continent) +
  scale_color_discrete() +
  theme_minimal()+ theme(legend.position="none") + 
  labs(title = "World countries: life expectancy vs obesity", 
       caption="Source: World Health Organization", 
       x = "obesity_value", y = "Life expectancy at birth (in years)")

Graph 2

ggplot(df_imput, aes(x=tobacco_value, y=life_value, group=continent)) + 
  geom_point(alpha=0.9) + #geom_smooth(se=F, size=0.3) +
  facet_wrap(~ continent) +
  scale_color_discrete() +
  theme_minimal()+ theme(legend.position="none") + 
  labs(title = "World countries: life expectancy vs tobacco usage", 
       caption="Source: World Health Organization", 
       x = "tobacco_value", y = "Life expectancy at birth (in years)")

Graph 3

ggplot(df_imput, aes(x=doc_den_value, y=life_value, group=continent)) + 
  geom_point(alpha=0.9) + #geom_smooth(se=F, size=0.3) +
  facet_wrap(~ continent) +
  scale_color_discrete() +
  theme_minimal()+ theme(legend.position="none") + 
  labs(title = "World countries: life expectancy vs doctor density", 
       caption="Source: World Health Organization", 
       x = "doctor density", y = "Life expectancy at birth (in years)")

## Graph 4

ggplot(df_imput, aes(x=gob_expenditure, y=life_value, group=continent)) + 
  geom_point(alpha=0.9) + #geom_smooth(se=F, size=0.3) +
  facet_wrap(~ continent) +
  scale_color_discrete() +
  theme_minimal()+ theme(legend.position="none") + 
  labs(title = "World countries: life expectancy vs goverment health expenditure ", 
       caption="Source: World Health Organization", 
       x = "goverment health expenditure", y = "Life expectancy at birth (in years)")

ggplot(df_imput, aes(x=gob_expenditure, y=life_value, group=continent)) + 
  geom_point(alpha=0.9) + #geom_smooth(se=F, size=0.3) +
  facet_wrap(~ continent) +
  scale_color_discrete() +
  theme_minimal()+ theme(legend.position="none") + 
  labs(title = "World countries: life expectancy vs goverment health expenditure ", 
       caption="Source: World Health Organization", 
       x = "goverment health expenditure", y = "Life expectancy at birth (in years)") 

df_imput |> 
  group_by(continent) |> 
  do(tidy(lm(life_value ~ gob_expenditure, data = .)))|> 
  select(continent, term, estimate) |> 
  spread(term, estimate) 
## # A tibble: 6 × 3
## # Groups:   continent [6]
##   continent     `(Intercept)` gob_expenditure
##   <chr>                 <dbl>           <dbl>
## 1 Africa                 60.9           0.384
## 2 Asia                   66.3           0.805
## 3 Europe                 70.0           0.712
## 4 North America          67.3           0.530
## 5 Oceania                58.2           1.13 
## 6 South America          65.0           0.792
g_Africa = df_imput |> select(country, continent, gob_expenditure, life_value) |> 
  filter(continent == "Africa") |> 
  ggplot(aes(x=gob_expenditure, y = life_value)) + 
  geom_point(alpha = 0.9) +
  geom_abline(slope = 0.4058175, intercept = 60.74990, color = "red")

g_Asia = df_imput |> select(country, continent, gob_expenditure, life_value) |> 
  filter(continent == "Asia") |> 
  ggplot(aes(x=gob_expenditure, y = life_value)) + 
  geom_point(alpha = 0.9) +
  geom_abline(slope = 0.8045696, intercept = 66.34065, color = "red")

g_Europe = df_imput |> select(country, continent, gob_expenditure, life_value) |> 
  filter(continent == "Europe") |> 
  ggplot(aes(x=gob_expenditure, y = life_value)) + 
  geom_point(alpha = 0.9) +
  geom_abline(slope = 0.7120162, intercept = 70.04078, color = "red")


g_North_America = df_imput |> select(country, continent, gob_expenditure, life_value) |> 
  filter(continent == "North America") |> 
  ggplot(aes(x=gob_expenditure, y = life_value)) + 
  geom_point(alpha = 0.9) +
  geom_abline(slope = 0.5301466, intercept = 67.31540, color = "red")

g_Oceania = df_imput |> select(country, continent, gob_expenditure, life_value) |> 
  filter(continent == "Oceania") |> 
  ggplot(aes(x=gob_expenditure, y = life_value)) + 
  geom_point(alpha = 0.9) +
  geom_abline(slope = 1.1299704, intercept = 58.16559, color = "red")

g_South_America = df_imput |> select(country, continent, gob_expenditure, life_value) |> 
  filter(continent == "South America") |> 
  ggplot(aes(x=gob_expenditure, y = life_value)) + 
  geom_point(alpha = 0.9) +
  geom_abline(slope = 0.7915068, intercept = 64.96808, color = "red")

library(patchwork)
g_Africa + g_Asia + g_Europe + g_North_America + g_Oceania + g_South_America +
   plot_annotation(
    title = "Merged Plot",
    tag_levels = "a",
    tag_suffix = ")"
  ) 

Graph 5

ggplot(df_imput, aes(x=road_death, y=life_value, group=continent)) + 
  geom_point(alpha=0.9) + #geom_smooth(se=F, size=0.3) +
  facet_wrap(~ continent) +
  scale_color_discrete() +
  theme_minimal()+ theme(legend.position="none") + 
  labs(title = "World countries: life expectancy vs road death", 
       caption="Source: World Health Organization", 
       x = "road death", y = "Life expectancy at birth (in years)")

df_imput |> 
  group_by(continent) |> 
  do(tidy(lm(life_value ~ road_death, data = .)))|> 
  select(continent, term, estimate) |> 
  spread(term, estimate) 
## # A tibble: 6 × 3
## # Groups:   continent [6]
##   continent     `(Intercept)` road_death
##   <chr>                 <dbl>      <dbl>
## 1 Africa                 70.1    -0.259 
## 2 Asia                   74.5    -0.0731
## 3 Europe                 80.7    -0.298 
## 4 North America          76.2    -0.142 
## 5 Oceania                73.3    -0.347 
## 6 South America          88.3    -0.723
g_Africa2 = df_imput |> select(country, continent, road_death, life_value) |> 
  filter(continent == "Africa") |> 
  ggplot(aes(x=road_death, y = life_value)) + 
  geom_point(alpha = 0.9) +
  geom_abline(slope = -0.25900186, intercept = 70.11643, color = "red")

g_Asia2 = df_imput |> select(country, continent, road_death, life_value) |> 
  filter(continent == "Asia") |> 
  ggplot(aes(x=road_death, y = life_value)) + 
  geom_point(alpha = 0.9) +
  geom_abline(slope = -0.07312883, intercept = 74.50443, color = "red")

g_Europe2 = df_imput |> select(country, continent, road_death, life_value) |> 
  filter(continent == "Europe") |> 
  ggplot(aes(x=road_death, y = life_value)) + 
  geom_point(alpha = 0.9) +
  geom_abline(slope = -0.29809254, intercept = 80.68437, color = "red")

g_North_America2 = df_imput |> select(country, continent, road_death, life_value) |> 
  filter(continent == "North America") |> 
  ggplot(aes(x=road_death, y = life_value)) + 
  geom_point(alpha = 0.9) +
  geom_abline(slope = -0.14201323, intercept = 76.20349, color = "red")

g_Oceania2 = df_imput |> select(country, continent, road_death, life_value) |> 
  filter(continent == "Oceania") |> 
  ggplot(aes(x=road_death, y = life_value)) + 
  geom_point(alpha = 0.9) +
  geom_abline(slope = -0.34742866, intercept = 73.30715, color = "red")

g_South_America2 = df_imput |> select(country, continent, road_death, life_value) |> 
  filter(continent == "South America") |> 
  ggplot(aes(x=road_death, y = life_value)) + 
  geom_point(alpha = 0.9) +
  geom_abline(slope = -0.72257724, intercept = 88.29099, color = "red")

library(patchwork)
g_Africa2 + g_Asia2 + g_Europe2 + g_North_America2 + g_Oceania2 + g_South_America2 +
   plot_annotation(
    title = "Merged Plot",
    tag_levels = "a",
    tag_suffix = ")"
  ) 

Graph 6

ggplot(df_imput, aes(x=birth_by_skilled, y=life_value, group=continent)) + 
  geom_point(alpha=0.9) + #geom_smooth(se=F, size=0.3) +
  facet_wrap(~ continent) +
  scale_color_discrete() +
  theme_minimal()+ theme(legend.position="none") + 
  labs(title = "World countries: life expectancy vs Births attended by skilled health personnel", 
       caption="Source: World Health Organization", 
       x = "Births attended by skilled health personnel", y = "Life expectancy at birth (in years)")

df_imput |> 
  group_by(continent) |> 
  do(tidy(lm(life_value ~ birth_by_skilled, data = .)))|> 
  select(continent, term, estimate) |> 
  spread(term, estimate) 
## # A tibble: 6 × 3
## # Groups:   continent [6]
##   continent     `(Intercept)` birth_by_skilled
##   <chr>                 <dbl>            <dbl>
## 1 Africa                 55.2           0.110 
## 2 Asia                   61.4           0.130 
## 3 Europe                 83.6          -0.0500
## 4 North America          66.0           0.0885
## 5 Oceania                42.8           0.304 
## 6 South America          29.6           0.482
g_Asia3 = df_imput |> select(country, continent, birth_by_skilled, life_value) |> 
  filter(continent == "Asia") |> 
  ggplot(aes(x=birth_by_skilled, y = life_value)) + 
  geom_point(alpha = 0.9) +
  geom_abline(slope = 0.15721923, intercept = 58.71548, color = "red")

g_North_America3 = df_imput |> select(country, continent, birth_by_skilled, life_value) |> 
  filter(continent == "North America") |> 
  ggplot(aes(x=birth_by_skilled, y = life_value)) + 
  geom_point(alpha = 0.9) +
  geom_abline(slope = 0.19524893, intercept = 56.22726, color = "red")

library(patchwork)
g_Asia3 + g_North_America3 +
   plot_annotation(
    title = "Merged Plot",
    tag_levels = "a",
    tag_suffix = ")"
  ) 

Graph 7

ggplot(df_imput, aes(x=maternal_death, y=life_value, group=continent)) + 
  geom_point(alpha=0.9) + #geom_smooth(se=F, size=0.3) +
  facet_wrap(~ continent) +
  scale_color_discrete() +
  theme_minimal()+ theme(legend.position="none") + 
  labs(title = "World countries: life expectancy vs maternal mortality ratio", 
       caption="Source: World Health Organization", 
       x = "maternal mortality ratio", y = "Life expectancy at birth (in years)")

PCA

#install.packages("prcomp")
#library(prcomp)
table(is.na(df_imput))
## 
## FALSE 
##  1749
pca = prcomp(df_imput_n, scale = TRUE)
summary(pca)
## Importance of components:
##                           PC1    PC2     PC3    PC4     PC5     PC6     PC7
## Standard deviation     2.0892 0.9260 0.85387 0.7823 0.68531 0.62932 0.60479
## Proportion of Variance 0.5456 0.1072 0.09114 0.0765 0.05871 0.04951 0.04572
## Cumulative Proportion  0.5456 0.6528 0.74390 0.8204 0.87910 0.92861 0.97433
##                            PC8
## Standard deviation     0.45317
## Proportion of Variance 0.02567
## Cumulative Proportion  1.00000
library(factoextra)
fviz_screeplot(pca, addlabels = TRUE)

First Component

barplot(pca$rotation[,1], las=2, col="darkblue")

fviz_contrib(pca, choice = "var", axes = 1)

name[order(pca$x[,1])][1:10]
##  [1] "Central African Republic" "Liberia"                 
##  [3] "Guinea-Bissau"            "Somalia"                 
##  [5] "Benin"                    "Lesotho"                 
##  [7] "Mali"                     "Zimbabwe"                
##  [9] "Chad"                     "Guinea"
name[order(pca$x[,1], decreasing=T)][1:10]
##  [1] "Cuba"      "Greece"    "Austria"   "Germany"   "Spain"     "Norway"   
##  [7] "Ireland"   "Sweden"    "Australia" "Uruguay"

Second Component

barplot(pca$rotation[,2], las=2, col="darkblue")

name[order(pca$x[,2])][1:5]
## [1] "Saudi Arabia" "Kuwait"       "El Salvador"  "Zimbabwe"     "Qatar"
name[order(pca$x[,2], decreasing=T)][1:5]
## [1] "India"            "Papua New Guinea" "Greece"           "Afghanistan"     
## [5] "Bulgaria"
fviz_contrib(pca, choice = "var", axes = 2)

## plot first two scores

data.frame(z1=pca$x[,1],z2=pca$x[,2]) %>% 
  ggplot(aes(z1,z2,label=name,color=continent)) + geom_point(size=0) +
  labs(title="First two principal components (scores)", x="PC1", y="PC2") + #guides(color=guide_legend(title="HDI"))+
  theme_bw() +theme(legend.position="bottom") + geom_text(size=3, hjust=0.6, vjust=0, check_overlap = TRUE) 

# The two first PCs seem independent

data.frame(z1=-pca$x[,1],Region=df_imput$country) %>% group_by(Region) %>% summarise(mean=mean(z1), n=n()) %>% arrange(desc(mean))
## # A tibble: 159 × 3
##    Region                    mean     n
##    <chr>                    <dbl> <int>
##  1 Central African Republic  5.45     1
##  2 Liberia                   4.36     1
##  3 Guinea-Bissau             4.14     1
##  4 Somalia                   3.95     1
##  5 Benin                     3.94     1
##  6 Lesotho                   3.88     1
##  7 Mali                      3.88     1
##  8 Zimbabwe                  3.85     1
##  9 Chad                      3.81     1
## 10 Guinea                    3.79     1
## # ℹ 149 more rows

Conclusion

# Map our PCA index in a map:
map = data.frame(country=name, value=-pca$x[,1])
#Convert the country code into iso3c using the function countrycode()
map$country = countrycode(map$country, 'country.name', 'iso3c')
#Create data object supporting the map
matched <- joinCountryData2Map(map, joinCode = "ISO3",
                               nameJoinColumn = "country")
## 159 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 84 codes from the map weren't represented in your data
#Draw the map
mapCountryData(matched,nameColumnToPlot="value",missingCountryCol = "white",
               addLegend = TRUE, borderCol = "#C7D9FF",
               catMethod = "pretty", colourPalette = "terrain",
               mapTitle = c("PCA1 by Country"), lwd=1)
## You asked for 7 categories, 10 were used due to pretty() classification

## Clustering

fit_5 = kmeans(df_imput_n, centers = 5, nstart = 1000)
fit_4 = kmeans(df_imput_n, centers = 4, nstart = 1000)
fit_3 = kmeans(df_imput_n, centers = 3, nstart = 1000)

Interpretation of centers:

centers=fit_5$centers

barplot(centers[1,], las=2, col="darkblue")

barplot(centers[2,], las=2, col="darkblue")

barplot(centers[3,], las=2, col="darkblue")

barplot(centers[4,], las=2, col="darkblue")

barplot(centers[5,], las=2, col="darkblue")

centers=fit_4$centers

barplot(centers[1,], las=2, col="darkblue")

barplot(centers[2,], las=2, col="darkblue")

barplot(centers[3,], las=2, col="darkblue")

barplot(centers[4,], las=2, col="darkblue")

centers=fit_3$centers

barplot(centers[1,], las=2, col="darkblue")

barplot(centers[2,], las=2, col="darkblue")

barplot(centers[3,], las=2, col="darkblue")

Visualization

fviz_cluster(fit_5, data = df_imput_n, geom = c("point"), ellipse.type = 'norm', pointsize = 1)+
  theme_minimal() + geom_text(label = name, hjust = 0, vjust = 0,size = 2, check_overlap = F) + scale_fill_brewer(palette = "Paired")
## Too few points to calculate an ellipse

fviz_cluster(fit_4, data = df_imput_n, geom = c("point"), ellipse.type = 'norm', pointsize = 1)+
  theme_minimal() + geom_text(label = name, hjust = 0, vjust = 0,size = 2, check_overlap = F) + scale_fill_brewer(palette = "Paired")

fviz_cluster(fit_3, data = df_imput_n, geom = c("point"), ellipse.type = 'norm', pointsize = 1)+
  theme_minimal() + geom_text(label = name, hjust = 0, vjust = 0,size = 2, check_overlap = F) + scale_fill_brewer(palette = "Paired")

Numbers of group

fviz_nbclust(scale(df_imput_n), kmeans, method = 'wss', k.max = 20, nstart = 1000)

# In this formula, the higher is the better
fviz_nbclust(scale(df_imput_n), kmeans, method = 'silhouette', k.max = 20, nstart = 1000)

# nboot : number of Bootsrapping
# if the line is on specific number then it means it is the maximum number of clusters
fviz_nbclust(scale(df_imput_n), kmeans, method = 'gap_stat', k.max = 10, nstart = 100, nboot = 100)

Insight

fit.km = kmeans(df_imput_n, centers = 3, nstart = 1000)

MAP

# Select here your favorite clustering tool
map = data.frame(country=name, value=fit.km$cluster)
#map = data.frame(country=names, value=fit.kmeans$cluster)

#Convert the country code into iso3c using the function countrycode()
map$country = countrycode(map$country, 'country.name', 'iso3c')
#Create data object supporting the map
matched <- joinCountryData2Map(map, joinCode = "ISO3",
                               nameJoinColumn = "country")
## 159 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 84 codes from the map weren't represented in your data
#Draw the map
mapCountryData(matched,nameColumnToPlot="value",missingCountryCol = "white",
               borderCol = "#C7D9FF",
               catMethod = "pretty", colourPalette = "rainbow",
               mapTitle = c("Clusters"), lwd=1)
## You asked for 7 categories, 4 were used due to pretty() classification

d = dist(scale(df_imput_n), method = "euclidean")
hc = hclust(d, method = "ward.D2")

hc$labels <- name

fviz_dend(x = hc, 
          k=3,
          palette = "jco", 
          rect = TRUE, rect_fill = TRUE, cex=0.5,
          rect_border = "jco"          
)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

fviz_dend(x = hc,
          k = 3,
          color_labels_by_k = TRUE,
          cex = 0.8,
          type = "phylogenic",
          repel = TRUE)+  labs(title="Death reasons tree clustering of the world") + theme(axis.text.x=element_blank(),axis.text.y=element_blank())

groups.hc = cutree(hc, k = 8)

# Map our PCA index in a map:
map = data.frame(country=name, value=groups.hc)
#Convert the country code into iso3c using the function countrycode()
map$country = countrycode(map$country, 'country.name', 'iso3c')
#Create data object supporting the map
matched <- joinCountryData2Map(map, joinCode = "ISO3",
                               nameJoinColumn = "country")
## 159 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 84 codes from the map weren't represented in your data
#Draw the map
mapCountryData(matched,nameColumnToPlot="value",missingCountryCol = "white",
               borderCol = "#C7D9FF",
               catMethod = "pretty", colourPalette = "rainbow",
               mapTitle = c("Clusters"), lwd=1)

heatmap

heatmap(scale(df_imput_n), scale = "none", labRow = name,
        distfun = function(x){dist(x, method = "euclidean")},
        hclustfun = function(x){hclust(x, method = "ward.D2")},
        cexRow = 0.7)